library(xgboost)
library(data.table)
library(cplm)

data(AutoClaim)

# auto insurance dataset analyzed by Yip and Yau (2005)
dt <- data.table(AutoClaim)

# exclude these columns from the model matrix
# (CLM_AMT5 is excluded because it is used as the response below)
exclude <- c('POLICYNO', 'PLCYDATE', 'CLM_FREQ5', 'CLM_AMT5', 'CLM_FLAG', 'IN_YY')

# retains the missing values: with na.action = 'na.pass' rows containing NA
# are kept when the model matrix is built instead of being dropped
# NOTE: this dataset comes ready out of the box
# options() returns the previous settings, so the caller's na.action is
# restored afterwards rather than being forced to 'na.omit'
old_na_action <- options(na.action = 'na.pass')
# `~ . - 1`: use every remaining column as a predictor, without an intercept
x <- sparse.model.matrix(~ . - 1, data = dt[, -exclude, with = FALSE])
options(old_na_action)

# response
y <- dt[, CLM_AMT5]

# NA entries in x are flagged to xgboost as missing values
d_train <- xgb.DMatrix(data = x, label = y, missing = NA)

# the tweedie_variance_power parameter determines the shape of the
# distribution
# - closer to 1 is more Poisson-like and the mass
#   is more concentrated near zero
# - closer to 2 is more gamma-like and the mass spreads to
#   the right with less concentration near zero

# booster settings: Tweedie objective, evaluated with RMSE
params <- list(
  objective              = 'reg:tweedie',
  eval_metric            = 'rmse',
  tweedie_variance_power = 1.4,
  max_depth              = 6,
  eta                    = 1
)

# fit 20 boosting rounds; the watchlist holds only the training set, so the
# per-round metric is an in-sample figure
bst <- xgb.train(
  params    = params,
  data      = d_train,
  nrounds   = 20,
  watchlist = list(train = d_train),
  maximize  = FALSE
)

# feature importance of the fitted booster; feature names are taken from the
# model matrix columns and passed by argument name so the call does not rely
# on the position of `feature_names` in the function signature
var_imp <- xgb.importance(feature_names = colnames(x), model = bst)

# in-sample predictions (same data the model was trained on)
preds <- predict(bst, d_train)

# training RMSE; mean() already yields a scalar, so no extra sum() is needed
rmse <- sqrt(mean((y - preds)^2))